library(TSA)
library(hts)
library(tidyverse)
library(lubridate)
library(prophet)
library(ggplot2)
library(plotly)
rm(list = ls())
gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2284636 122.1    4397135 234.9         NA  4321059 230.8
## Vcells 3902263  29.8    8388608  64.0      16384  6005348  45.9

Import Data

datapath = "/Volumes/SSD/TS-Project/data"
train = read.csv(file=paste(datapath,"subset/train_city_type_item.csv",sep="/"))

Transform to wide form

train_wide = spread(train, hts_label, tot_sales)
train_wide[is.na(train_wide)] = 0
train_wide = train_wide %>%
                  select(-city, -type, -family, -price, -n_promotion, 
                         -perishable, -national_flag, -pay.day_flag) %>%
                  group_by(date) %>%
                  summarise_all(funs(sum))
train.ts = ts(train_wide[2:ncol(train_wide)], frequency=365.25, 
     start = c(year("2013-01-01"), 1))

SMAPE Function

smape = function(y_pred, y_actual) {
  error = mean(abs(y_pred - y_actual)/(abs(y_actual) + abs(y_pred)))
  return(error)
}

Predict Without Holdays

# Adding total sales column to data and prepare for prophet
train.prophet <- train_wide[,-2]
Tot_Sales <- rowSums(train.prophet[,-c(1,2)])
holidays <- train.prophet[,c(1,2)]
train.prophet <- cbind(train.prophet[,1],Tot_Sales)

# Split into training and validation (use last 20 days as validation data)
n <- nrow(train.prophet)
train.valid <- train.prophet[(n-19):n, ]
colnames(train.valid) <- c("date","y")
train.valid$date <- as.Date(train.valid$date)
train.real <- train.prophet[1:(n-20), ]

# Preparing for prophet
train.real$date <- as.Date(train.real$date)
colnames(train.real) <- c("ds","y")

# Running prophet with no parameters set
train.pro <- prophet(train.real)
future <- make_future_dataframe(train.pro, periods = 20)
result_pro <- predict(train.pro,future)

# plot to see how well the model predicted validation data
# red is predicted; black is actual
plot1 <- ggplot()+
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
  scale_x_date(labels = scales::date_format("%b %d"),
    breaks = scales::date_breaks("1 day"))+
  theme_minimal()+
  theme(
    axis.text.x = element_text(angle = 45),
    legend.position = "bottom"
  )

ggplotly(plot1 , dynamicTicks = TRUE)
# Making the model better

# 1. Taking holiday into account; I already constructed a holidays table earlier
colnames(holidays) <- c("ds","holiday")
holidays$holiday <- as.character(holidays$holiday)

train.pro.new <- prophet(train.real,holidays = holidays)
future <- make_future_dataframe(train.pro.new, periods = 20)
result.pro.new <- predict(train.pro.new,future)

# plot to see if the model improved
# black is actual; green is old; red is new
plot2 <- ggplot()+
geom_line(data = result.pro.new %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
  scale_x_date(labels = scales::date_format("%b %d"),
    breaks = scales::date_breaks("1 day"))+
  theme_minimal()+
  theme(
    axis.text.x = element_text(angle = 45),
    legend.position = "bottom"
  )

ggplotly(plot2 , dynamicTicks = TRUE)
# The model did worse, so I'm going to remove holiday data from the model
# 2. Changing seasonality.mode to multiplicative
train.pro.new2 <- prophet(train.real, seasonality.mode = "multiplicative")
future <- make_future_dataframe(train.pro.new2, periods = 20)
result.pro.new2 <- predict(train.pro.new2,future)

# plot to see if the model improved
# black is actual; green is old; red is new
plot2 <- ggplot()+
geom_line(data = result.pro.new2 %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result.pro.new %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
  scale_x_date(labels = scales::date_format("%b %d"),
    breaks = scales::date_breaks("1 day"))+
  theme_minimal()+
  theme(
    axis.text.x = element_text(angle = 45),
    legend.position = "bottom"
  )

ggplotly(plot2 , dynamicTicks = TRUE)
# 2. Change point scale
# reference: https://www.kaggle.com/holdenyau/prophet/code
result <- data.frame(scale = 1:10 , traing_RMSE = 1: 10 , valid_RMSE = 1:10)
for (i in 1:10){
  train.pro.opt <- prophet(train.real, changepoint.prior.scale = i/100)
  future <- make_future_dataframe(train.pro.opt, periods = 20)
result.pro.opt <- predict( train.pro.opt , future)  
 v_traing <- accuracy(result.pro.opt$yhat[1:1659] , train.real$y )[2]
 v_valid  <- accuracy(result.pro.opt$yhat[1660:1679] , train.valid$y )[2]
 result$scale[i] <- i/100
 result$traing_RMSE[i] <- v_traing
 result$valid_RMSE[i] <- v_valid
}

result %>%
  gather(`traing_RMSE` , `valid_RMSE` , key = "type" , value = "RMSE") %>%
  ggplot()+
    geom_point(aes(x = scale , y = RMSE , col = type))+
    geom_line( aes(x = scale , y = RMSE , group = type))+
  theme_minimal()+
  theme(legend.position = "bottom")

plot3 <- ggplot()+
geom_line(data = result.pro.opt %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "red")+
geom_line(data = result_pro %>% tail(20),aes(x = as.Date(ds) , y = yhat),col = "green")+
geom_line(data = train.valid, aes(x = date, y = y) ,col = "black")+
  scale_x_date(labels = scales::date_format("%b %d"),
    breaks = scales::date_breaks("1 day"))+
  theme_minimal()+
  theme(
    axis.text.x = element_text(angle = 45),
    legend.position = "bottom"
  )

ggplotly(plot3 , dynamicTicks = TRUE)

Smape of validation data

smape(train.valid$y,result_pro$yhat %>% tail(20))
## [1] 0.03945145
smape(train.valid$y,result.pro.opt$yhat %>% tail(20))
## [1] 0.03955245
# Preparing proportion table for splitting forecast data to sub time series
train.proportion <- train_wide[,-c(2,3)]
Tot_Sales <- rowSums(train.proportion[,-1])
train.proportion <- cbind(train.proportion, Tot_Sales)
for (i in 2:47){
  for (j in 1:1659){
    train.proportion[j,i] <- train.proportion[j,i]/train.proportion[j,47]
  }
}

# Calculating average proportion
train.proportion <- train.proportion[1:1659,]
avg.proportion <- colSums(train.proportion[,-1]/1659)

# Assigning forecasted value based on average and forecasted total sales
# We will only be using the last 20 rows of train.proportion later
for (i in 2:47){
  for (j in 1:1679){
    train.proportion[j,i] <- result_pro$yhat[j]*avg.proportion[i-1]
  }
}

smape.values <- data.frame()

# Calculating smape for each column
for (i in 4:48){
  smape.values <- rbind(smape.values, smape(as.numeric(unlist((train_wide[,i] %>% tail(20)))),train.proportion[,(i-2)] %>% tail(20)))
}

# Taking the average
smape.final <- colSums(smape.values)/45; smape.final
## X0.0888549625165144 
##           0.1524263

Cross-Validation Function

prophet.CV = function(y, window, h) {
  # y is a dataset with date and sales - prepped for prophet
  cv.error = matrix(ncol = 1)
  
  for (i in seq(1, nrow(y) - (window + h) + 1)) {
    # Setting the window
    train_start = i
    train_end = i + window - 1
    test_start = i + window
    test_end = i + window + h - 1
    
    # Test set
    train.valid <- y[test_start:test_end, ]
    colnames(train.valid) <- c("date","y")
    train.valid$date <- as.Date(train.valid$date)

    # Train set
    train.real <- y[train_start:train_end, ]
    train.real$date <- as.Date(train.real$date)
    colnames(train.real) <- c("ds","y")

    # Predict
    train.pro <- prophet(train.real)
    future <- make_future_dataframe(train.pro, periods = h)
    fc = predict(train.pro,future)
    print(paste("Done", as.character(i)))

    pred = tail(fc$yhat, h)
    acc = smape(pred, train.valid$y)
    cv.error = rbind(cv.error, acc)
  }
  
  return(cv.error)
}

# Adding total sales column to data and prepare for prophet
train.prophet <- train_wide[,-2]
Tot_Sales <- rowSums(train.prophet[,-c(1,2)])
holidays <- train.prophet[,c(1,2)]
train.prophet <- cbind(train.prophet[,1],Tot_Sales)

# Run CV
cv.acc = prophet.CV(y = train.prophet, window = 1650, h = 20)
## [1] "Done 1"
## [1] "Done 2"
## [1] "Done 3"
## [1] "Done 4"
## [1] "Done 5"
## [1] "Done 6"
## [1] "Done 7"
## [1] "Done 8"
## [1] "Done 9"
## [1] "Done 10"

CV Errors

# Mean SMAPE
mean(cv.acc, na.rm = T)
## [1] 0.04085337
# Standard Deviation of SMAPE
sd(cv.acc, na.rm = T)
## [1] 0.002066195